In [1]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier,plot_importance
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import plotly.io as pio
pio.renderers.default='notebook'

Data Exploration¶

In [2]:
df = pd.read_csv("term-deposit-marketing-2020.csv")
print(df)
       age           job   marital  education default  balance housing loan  \
0       58    management   married   tertiary      no     2143     yes   no   
1       44    technician    single  secondary      no       29     yes   no   
2       33  entrepreneur   married  secondary      no        2     yes  yes   
3       47   blue-collar   married    unknown      no     1506     yes   no   
4       33       unknown    single    unknown      no        1      no   no   
...    ...           ...       ...        ...     ...      ...     ...  ...   
39995   53    technician   married   tertiary      no      395      no   no   
39996   30    management    single   tertiary      no     3340      no   no   
39997   54         admin  divorced  secondary      no      200      no   no   
39998   34    management   married   tertiary      no     1047      no   no   
39999   38    technician   married  secondary      no     1442     yes   no   

        contact  day month  duration  campaign    y  
0       unknown    5   may       261         1   no  
1       unknown    5   may       151         1   no  
2       unknown    5   may        76         1   no  
3       unknown    5   may        92         1   no  
4       unknown    5   may       198         1   no  
...         ...  ...   ...       ...       ...  ...  
39995  cellular    3   jun       107         1   no  
39996  cellular    3   jun       238         3  yes  
39997  cellular    3   jun       170         1  yes  
39998  cellular    3   jun       342         1   no  
39999  cellular    3   jun       113         1   no  

[40000 rows x 14 columns]
In [3]:
df.head()
Out[3]:
age job marital education default balance housing loan contact day month duration campaign y
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 no
1 44 technician single secondary no 29 yes no unknown 5 may 151 1 no
2 33 entrepreneur married secondary no 2 yes yes unknown 5 may 76 1 no
3 47 blue-collar married unknown no 1506 yes no unknown 5 may 92 1 no
4 33 unknown single unknown no 1 no no unknown 5 may 198 1 no
In [4]:
df.tail()
Out[4]:
age job marital education default balance housing loan contact day month duration campaign y
39995 53 technician married tertiary no 395 no no cellular 3 jun 107 1 no
39996 30 management single tertiary no 3340 no no cellular 3 jun 238 3 yes
39997 54 admin divorced secondary no 200 no no cellular 3 jun 170 1 yes
39998 34 management married tertiary no 1047 no no cellular 3 jun 342 1 no
39999 38 technician married secondary no 1442 yes no cellular 3 jun 113 1 no
In [5]:
df.describe()
Out[5]:
age balance day duration campaign
count 40000.000000 40000.000000 40000.000000 40000.000000 40000.000000
mean 40.544600 1274.277550 16.017225 254.824300 2.882175
std 9.641776 2903.769716 8.278127 259.366498 3.239051
min 19.000000 -8019.000000 1.000000 0.000000 1.000000
25% 33.000000 54.000000 8.000000 100.000000 1.000000
50% 39.000000 407.000000 17.000000 175.000000 2.000000
75% 48.000000 1319.000000 21.000000 313.000000 3.000000
max 95.000000 102127.000000 31.000000 4918.000000 63.000000
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  int64 
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  object
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  y          40000 non-null  object
dtypes: int64(5), object(9)
memory usage: 4.3+ MB

Data cleaning¶

1- Check for missing values

In [7]:
df.isnull().sum()
Out[7]:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
y            0
dtype: int64

No missing values are present

2- Checking outliers

In [8]:
fig = px.histogram(df[df.age<70], x="age")
fig.show()

We can see that the majority of people have age between 30 and 40. After 60 only few samples are present

In [9]:
fig = px.histogram(df[(-3000<= df['balance']) & (df['balance']<=10000)], x="balance",nbins=10)
fig

The majority of the people have the balance in the range between 0 and 2k with very few samples after 6k.

In [10]:
fig = px.histogram(df['day'], x="day")
fig
In [11]:
counts, bins = np.histogram(df.duration, bins=range(0, 1000, 30))
bins = 0.5 * (bins[:-1] + bins[1:])
fig = px.bar(x=bins, y=counts, labels={'x':'duration', 'y':'count'})
fig

The duration of the last contact seems to be in the range of 500 seconds for most people, the number of people decrease constantly after that

In [12]:
counts, bins = np.histogram(df.campaign, bins=range(0, 21, 6))
bins = 0.5 * (bins[:-1] + bins[1:])
fig = px.bar(x=bins, y=counts, labels={'x':'campaign', 'y':'count'})
fig

The number of contacts performed during this campaign seems to be approximately between one and 1 and 3 for most client rarely exceeding 13.

Visualize data and features engineering¶

In [13]:
subscriber_counts = df['y'].value_counts()
fig = go.Figure(data=[go.Pie(labels=subscriber_counts.keys().tolist(), values=subscriber_counts)])
fig
In [14]:
df['y']=pd.factorize(df['y'])[0]
In [15]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  int64 
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  object
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  y          40000 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 4.3+ MB
In [16]:
df_sub = df[df.y==1]
df_sub.shape
Out[16]:
(2896, 14)
In [17]:
df_not_sub = df[df.y==0]
df_not_sub.shape
Out[17]:
(37104, 14)

The data are unbalanced with only 2896 subscriber aganist 37104 non subscriber. We need to remember this in the classification task.

Age¶

In [18]:
label = ['18-20','20-25', '25-30', '30-35','35-40','40-45','45-50','50-55','55-60','60-65','65-70','>70']
num_samp_age = [(df[(18 <= df['age']) & (df['age'] <= 20)]).shape[0],(df[(20 < df['age']) & (df['age'] <= 25)]).shape[0],(df[(25 < df['age']) & (df['age'] <= 30)]).shape[0],(df[(30 < df['age']) & (df['age'] <= 35)]).shape[0],(df[(35 < df['age']) & (df['age'] <= 40)]).shape[0],(df[(40 < df['age']) & (df['age'] <= 45)]).shape[0],(df[(45 < df['age']) & (df['age'] <= 50)]).shape[0],(df[(50 < df['age']) & (df['age'] <= 55)]).shape[0],(df[(55 < df['age']) & (df['age'] <= 60)]).shape[0],(df[(60 < df['age']) & (df['age'] <= 65)]).shape[0],(df[(65 < df['age']) & (df['age'] <= 70)]).shape[0],(df[(df['age'] > 70)]).shape[0]]
perc_age = [round((df[(18 <= df['age']) & (df['age'] <= 20)& (df['y'] == 1)]).shape[0]/num_samp_age[0]*100,2),round((df[(20 < df['age']) & (df['age'] <= 25)& (df['y'] == 1)]).shape[0]/num_samp_age[1]*100,2),round((df[(25 < df['age']) & (df['age'] <= 30)& (df['y'] == 1)]).shape[0]/num_samp_age[2]*100,2),round((df[(30 < df['age']) & (df['age'] <= 35)& (df['y'] == 1)]).shape[0]/num_samp_age[3]*100,2),round((df[(35 < df['age']) & (df['age'] <= 40)& (df['y'] == 1)]).shape[0]/num_samp_age[4]*100,2),round((df[(40 < df['age']) & (df['age'] <= 45)& (df['y'] == 1)]).shape[0]/num_samp_age[5]*100,2),round((df[(45 < df['age']) & (df['age'] <= 50)& (df['y'] == 1)]).shape[0]/num_samp_age[6]*100,2),round((df[(50 < df['age']) & (df['age'] <= 55)& (df['y'] == 1)]).shape[0]/num_samp_age[7]*100,2),round((df[(55 < df['age']) & (df['age'] <= 60)& (df['y'] == 1)]).shape[0]/num_samp_age[8]*100,2),round((df[(60 < df['age']) & (df['age'] <= 65)& (df['y'] == 1)]).shape[0]/num_samp_age[9]*100,2),round((df[(65 < df['age']) & (df['age'] <= 70)& (df['y'] == 1)]).shape[0]/num_samp_age[10]*100,2),round((df[(df['age'] > 70)& (df['y'] == 1)]).shape[0]/num_samp_age[11]*100,2)]
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_age, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_age, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Age feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Age range")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

The subscriber percentage increase after 60 years old

In [19]:
label = ['18-30','30-40', '40-50', '50-60','>60']
num_samp_age = [(df[(18 <= df['age']) & (df['age'] <= 30)]).shape[0],(df[(30 < df['age']) & (df['age'] <= 40)]).shape[0],(df[(40 < df['age']) & (df['age'] <= 50)]).shape[0],(df[(50 < df['age']) & (df['age'] <= 60)]).shape[0],(df[(df['age'] > 60)]).shape[0]]
perc_age = [round((df[(18 <= df['age']) & (df['age'] <= 30)& (df['y'] == 1)]).shape[0]/num_samp_age[0]*100,2),round((df[(30 < df['age']) & (df['age'] <= 40)& (df['y'] == 1)]).shape[0]/num_samp_age[1]*100,2),round((df[(40 < df['age']) & (df['age'] <= 50)& (df['y'] == 1)]).shape[0]/num_samp_age[2]*100,2),round((df[(50 < df['age']) & (df['age'] <= 60)& (df['y'] == 1)]).shape[0]/num_samp_age[3]*100,2),round((df[(df['age'] > 60)& (df['y'] == 1)]).shape[0]/num_samp_age[4]*100,2)]
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_age, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_age, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Age feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Age range")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig
In [20]:
def func(x):
    if x >= 18 and x<=30:
        return '18_30'
    elif x > 30 and x<=40:
        return '30_40'
    elif x > 40 and x<=50:
        return '40_50'
    elif x > 50 and x<=60:
        return '50_60'
    else:
        return 'over60'
df['age'] = df['age'].apply(func)

Job¶

In [21]:
print(df['job'].unique()) 
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
In [22]:
print(df['job'].value_counts()) 
job
blue-collar      9383
management       8166
technician       6852
admin            4483
services         3910
retired          1437
self-employed    1414
entrepreneur     1405
unemployed       1104
housemaid        1087
student           524
unknown           235
Name: count, dtype: int64
In [23]:
label = ['management','technician','entrepreneur','blue-collar','unknown','retired','admin','services','self-employed','unemployed','housemaid','student']
num_samp_job = []
perc_job = []
for l in label:
    num_samp_job.append(df[df.job==l].shape[0])
for i,l in enumerate(label):
    perc_job.append(round((df[(df.job==l) & (df['y'] == 1)]).shape[0]/num_samp_job[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_job, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_job, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Job feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Type of job")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

Student and retired seems to have the larger number of subscriber. Given the number of categories and samples it would be better to merge categories together. The categories unemployed, housemaid, student, unknown, retired and self-employed will be grouped together in the category 'Not full-time job'. The categories admin, technician, services will be grouped together in the category 'Office job' and the categories entrepreneur, management, blue-collar will be grouped together in the category 'High profile job'.

In [24]:
d = {'technician':'office_job','admin':'office_job','services':'office_job','unemployed':'not_full_time_job','housemaid':'not_full_time_job','student':'not_full_time_job','unknown':'not_full_time_job','retired':'not_full_time_job','self-employed':'not_full_time_job','entrepreneur':'high_profile_job','management':'high_profile_job','blue-collar':'high_profile_job'}
df['job'] = df['job'].replace(d)
df_sub['job'] = df_sub['job'].replace(d)
df_not_sub['job'] = df_not_sub['job'].replace(d)
C:\Users\PicardiC\AppData\Local\Temp\ipykernel_9684\2141811894.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\PicardiC\AppData\Local\Temp\ipykernel_9684\2141811894.py:4: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [25]:
print(df['job'].unique()) 
print(df_sub['job'].unique())
print(df_not_sub['job'].unique())
['high_profile_job' 'office_job' 'not_full_time_job']
['office_job' 'high_profile_job' 'not_full_time_job']
['high_profile_job' 'office_job' 'not_full_time_job']
In [26]:
print(df['job'].value_counts()) 
job
high_profile_job     18954
office_job           15245
not_full_time_job     5801
Name: count, dtype: int64
In [27]:
label = ['high_profile_job','office_job','not_full_time_job' ]
num_samp_job = []
perc_job = []
for l in label:
    num_samp_job.append(df[df.job==l].shape[0])
for i,l in enumerate(label):
    perc_job.append(round((df[(df.job==l) & (df['y'] == 1)]).shape[0]/num_samp_job[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_job, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_job, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Job feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Type of job")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

Unexepectly the percentage of subscribers decrease with position grow, but this can be due to the limited number of sample in not a full-time job category.Also the result is determined by the two categories student and retired which have a larger number of subscriber.

Marital¶

In [28]:
print(df['marital'].unique()) 
['married' 'single' 'divorced']
In [29]:
print(df['marital'].value_counts()) 
marital
married     24386
single      10889
divorced     4725
Name: count, dtype: int64
In [30]:
label = ['married','single', 'divorced']
num_samp_mar = []
perc_mar = []
for l in label:
    num_samp_mar.append(df[df.marital==l].shape[0])
for i,l in enumerate(label):
    perc_mar.append(round((df[(df.marital==l) & (df['y'] == 1)]).shape[0]/num_samp_mar[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_mar, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_mar, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Marital feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Marital status")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

The greatest subscriber percentage is in the single people. The reason of this could be related to less expenses or just related to sample size.

Education¶

In [31]:
print(df['education'].unique()) 
['tertiary' 'secondary' 'unknown' 'primary']
In [32]:
label = ['primary', 'secondary', 'tertiary', 'unknown']
num_samp_ed = []
perc_ed = []
for l in label:
    num_samp_ed.append(df[df.education==l].shape[0])
for i,l in enumerate(label):
    perc_ed.append(round((df[(df.education==l) & (df['y'] == 1)]).shape[0]/num_samp_ed[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_ed, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_ed, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Education feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Type of education")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

The percentage of subscriber increases with the level of education. This feature seems to affect the target.

Default¶

In [33]:
print(df['default'].unique()) 
['no' 'yes']
In [34]:
print(df['default'].value_counts()) 
default
no     39191
yes      809
Name: count, dtype: int64
In [35]:
label = ['no', 'yes']
num_samp_def = []
perc_def = []
for l in label:
    num_samp_def.append((df[df.default==l]).shape[0])
for i,l in enumerate(label):
    perc_def.append(round((df[(df.default==l) & (df['y'] == 1)]).shape[0]/num_samp_def[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_def, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_def, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Default feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Account in default")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig
In [36]:
df['default']=pd.factorize(df['default'])[0]
In [37]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  object
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  int64 
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  y          40000 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 4.3+ MB
In [38]:
print(df['default'].value_counts())
default
0    39191
1      809
Name: count, dtype: int64

As expected the majority of subscriber doesn't have a default on the account.

Balance¶

In [39]:
label = ['-2000, 0','0, 2000','2000, 4000','4000, 6000','6000, 8000','8000, 10000', '>10000']
num_samp_bal = [(df[(-2000 <= df['balance']) & (df['balance'] <= 0)]).shape[0],(df[(0 < df['balance']) & (df['balance'] <= 2000)]).shape[0],(df[(2000 < df['balance']) & (df['balance'] <= 4000)]).shape[0],(df[(4000 < df['balance']) & (df['balance'] <= 6000)]).shape[0],(df[(6000 < df['balance']) & (df['balance'] <= 8000)]).shape[0],(df[(8000 < df['balance']) & (df['balance'] <= 10000)]).shape[0],(df[(df['balance'] > 10000)].shape[0])]
perc_bal = [round((df_sub[(-2000 <= df_sub['balance']) & (df_sub['balance'] <= 0)]).shape[0]/num_samp_bal[0]*100,2),round((df_sub[(0 < df_sub['balance']) & (df_sub['balance'] <= 2000)]).shape[0]/num_samp_bal[1]*100,2),round((df_sub[(2000 < df_sub['balance']) & (df_sub['balance'] <= 4000)]).shape[0]/num_samp_bal[2]*100,2),round((df_sub[(4000 < df_sub['balance']) & (df_sub['balance'] <= 6000)]).shape[0]/num_samp_bal[3]*100,2),round((df_sub[(6000 < df_sub['balance']) & (df_sub['balance'] <= 8000)]).shape[0]/num_samp_bal[4]*100,2),round((df_sub[(8000 < df_sub['balance']) & (df_sub['balance'] <= 10000)]).shape[0]/num_samp_bal[5]*100,2),round((df_sub[(df_sub['balance'] > 10000)]).shape[0]/num_samp_bal[6]*100,2)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_bal, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_bal, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Balance feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Balance")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

The percentage of subscriber increases in the range 2000-4000 and 4000-6000, then decreases again for the range 6000-8000 and 8000-10000 finally increasing in balance>10000. So it seems that subscriber are more in people with medium and high-balance. Considering after 6k we have only few samples let's do a different grouping.

In [40]:
label = ['-2000,0','0-2000','2000-6000','>6000']
num_samp_bal = [(df[(-2000 <= df['balance']) & (df['balance'] <= 0)]).shape[0],(df[(0 < df['balance']) & (df['balance'] <= 2000)]).shape[0],(df[(2000 < df['balance']) & (df['balance'] <= 6000)]).shape[0],(df[(df['balance'] > 6000)]).shape[0]]
perc_bal = [round((df_sub[(-2000 <= df_sub['balance']) & (df_sub['balance'] <= 0)]).shape[0]/num_samp_bal[0]*100,2),round((df_sub[(0 < df_sub['balance']) & (df_sub['balance'] <= 2000)]).shape[0]/num_samp_bal[1]*100,2),round((df_sub[(2000 < df_sub['balance']) & (df_sub['balance'] <= 6000)]).shape[0]/num_samp_bal[2]*100,2),round((df_sub[(df_sub['balance'] > 6000)]).shape[0]/num_samp_bal[3]*100,2)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_bal, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_bal, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Balance feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Balance")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig
In [41]:
def func(x):
    if x >= 0 and x<=-2000:
        return 'very_low_balance'
    if x > 0 and x<=2000:
        return 'medium_balance'
    elif x > 2000 and x<=6000:
        return 'high_balance'
    else:
        return 'very_high_balance'
df['balance'] = df['balance'].apply(func)

Housing¶

In [42]:
print(df['housing'].unique())
['yes' 'no']
In [43]:
print(df['housing'].value_counts())
housing
yes    24031
no     15969
Name: count, dtype: int64
In [44]:
label = ['yes','no']
num_samp_hou = []
perc_hou = []
for l in label:
    num_samp_hou.append((df[df.housing==l]).shape[0])
for i,l in enumerate(label):
    perc_hou.append(round((df[(df.housing==l) & (df['y'] == 1)]).shape[0]/num_samp_hou[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_hou, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_hou, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Housing feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Housing loan")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

As expected the percentage of subscriber is greater in people without house loan

In [45]:
df['housing']=pd.factorize(df['housing'])[0]
In [46]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  object
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  int64 
 5   balance    40000 non-null  object
 6   housing    40000 non-null  int64 
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  y          40000 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 4.3+ MB
In [47]:
print(df['housing'].value_counts())
housing
0    24031
1    15969
Name: count, dtype: int64

Loan¶

In [48]:
print(df['loan'].unique())
['no' 'yes']
In [49]:
print(df['loan'].value_counts())
loan
no     33070
yes     6930
Name: count, dtype: int64
In [50]:
label = ['yes','no']
num_samp_loan = []
perc_loan = []
for l in label:
    num_samp_loan.append((df[df.loan==l]).shape[0])
for i,l in enumerate(label):
    perc_loan.append(round((df[(df.loan==l) & (df['y'] == 1)]).shape[0]/num_samp_loan[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_loan, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_loan, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text= "Loan feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Personal loan")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

As expected the percentage of subscriber is greater in people without a personal loan.

In [51]:
df['loan']=pd.factorize(df['loan'])[0]
In [52]:
print(df['loan'].value_counts())
loan
0    33070
1     6930
Name: count, dtype: int64

Contact¶

In [53]:
print(df['contact'].unique())
['unknown' 'cellular' 'telephone']
In [54]:
print(df['contact'].value_counts())
contact
cellular     24914
unknown      12765
telephone     2321
Name: count, dtype: int64
In [55]:
label = ['cellular', 'telephone', 'unknown']
num_samp_con = []
perc_con = []
for l in label:
    num_samp_con.append((df[df.contact==l]).shape[0])
for i,l in enumerate(label):
    perc_con.append(round((df[(df.contact==l) & (df['y'] == 1)]).shape[0]/num_samp_con[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_con, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_con, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Contact feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Mean of contact")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

People contacted with cellular seem to have the greatest number of subscriber, but it could just be related to the number of samples. Let's drop this feature.

Day¶

In [56]:
label =  list(range(1,32))
num_samp_day = []
perc_day = []
for l in label:
    num_samp_day.append((df[df.day==l]).shape[0])
for i,l in enumerate(label):
    perc_day.append(round((df[(df.day==l) & (df['y'] == 1)]).shape[0]/num_samp_day[i]*100,2))
# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_day, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_day, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Day feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Day of contact")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

The percentage of subscriber seems to increase the 13th day and 22nd.

Month¶

In [57]:
df['month'].unique()
Out[57]:
array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb',
       'mar', 'apr'], dtype=object)
In [58]:
label = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec']
num_samp_mon = []
perc_mon = []
for l in label:
    num_samp_mon.append((df[df.month==l]).shape[0])
for i,l in enumerate(label):
    perc_mon.append(round((df[(df.month==l) & (df['y'] == 1)]).shape[0]/num_samp_mon[i]*100,2))
# Create Plot


fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_mon, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_mon, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Month feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Month of last contact")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

October and march have the gratest number of subscriber but this could be due just to the sample size, so this variable can be dropped.

Duration¶

In [59]:
label = ['0-60','60-120','120-180','180-240','240-300','300-360','360-420','420-480','480-540','540-600','600-660','660-720','720-780','780-840','840-900','900-960','>960']
num_samp_dur = [(df[(0 <= df['duration']) & (df['duration'] <= 60)]).shape[0],(df[(60 < df['duration']) & (df['duration'] <= 120)]).shape[0],(df[(120 < df['duration']) & (df['duration'] <= 180)]).shape[0],(df[(180 < df['duration']) & (df['duration'] <= 240)]).shape[0],(df[(240 < df['duration']) & (df['duration'] <= 300)]).shape[0],(df[(300 < df['duration']) & (df['duration'] <= 360)]).shape[0],(df[(360 < df['duration']) & (df['duration'] <= 420)]).shape[0],(df[(420 < df['duration']) & (df['duration'] <= 480)]).shape[0],(df[(480 < df['duration']) & (df['duration'] <= 540)]).shape[0],(df[(540 < df['duration']) & (df['duration'] <= 600)]).shape[0],(df[(600 < df['duration']) & (df['duration'] <= 660)]).shape[0],(df[(660 < df['duration']) & (df['duration'] <= 720)]).shape[0],(df[(720 < df['duration']) & (df['duration'] <= 780)]).shape[0],(df[(780 < df['duration']) & (df['duration'] <= 840)]).shape[0],(df[(840 < df['duration']) & (df['duration'] <= 900)]).shape[0],(df[(900 < df['duration']) & (df['duration'] <= 960)]).shape[0],(df[(df['duration'] > 960)].shape[0])]
perc_dur = [round((df_sub[(0 <= df_sub['duration']) & (df_sub['duration'] <= 60)]).shape[0]/num_samp_dur[0]*100,2),round((df_sub[(60 < df_sub['duration']) & (df_sub['duration'] <= 120)]).shape[0]/num_samp_dur[1]*100,2),round((df_sub[(120 < df_sub['duration']) & (df_sub['duration'] <= 180)]).shape[0]/num_samp_dur[2]*100,2),round((df_sub[(180 < df_sub['duration']) & (df_sub['duration'] <= 240)]).shape[0]/num_samp_dur[3]*100,2),round((df_sub[(240 < df_sub['duration']) & (df_sub['duration'] <= 300)]).shape[0]/num_samp_dur[4]*100,2),round((df_sub[(300 < df_sub['duration']) & (df_sub['duration'] <= 360)]).shape[0]/num_samp_dur[5]*100,2),round((df_sub[(360 < df_sub['duration']) & (df_sub['duration'] <= 420)]).shape[0]/num_samp_dur[6]*100,2),round((df_sub[(420 < df_sub['duration']) & (df_sub['duration'] <= 480)]).shape[0]/num_samp_dur[7]*100,2),round((df_sub[(480 < df_sub['duration']) & (df_sub['duration'] <= 540)]).shape[0]/num_samp_dur[8]*100,2),round((df_sub[(540 < df_sub['duration']) & (df_sub['duration'] <= 600)]).shape[0]/num_samp_dur[9]*100,2),round((df_sub[(600 < df_sub['duration']) & (df_sub['duration'] <= 660)]).shape[0]/num_samp_dur[10]*100,2),round((df_sub[(660 < df_sub['duration']) & (df_sub['duration'] <= 720)]).shape[0]/num_samp_dur[11]*100,2),round((df_sub[(720 < df_sub['duration']) & (df_sub['duration'] <= 780)]).shape[0]/num_samp_dur[12]*100,2),round((df_sub[(780 < df_sub['duration']) & (df_sub['duration'] <= 840)]).shape[0]/num_samp_dur[13]*100,2),round((df_sub[(840 < df_sub['duration']) & (df_sub['duration'] <= 900)]).shape[0]/num_samp_dur[14]*100,2),round((df_sub[(900 < df_sub['duration']) & (df_sub['duration'] <= 960)]).shape[0]/num_samp_dur[15]*100,2),round((df_sub[(df_sub['duration'] > 960)]).shape[0]/num_samp_dur[16]*100,2)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_dur, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_dur, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Duration feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Duration of last contact")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

The percentage of subscriber increase with the duration of the last contact. Let's try a different grouping

In [60]:
label = ['0-180','180-360','>360']
num_samp_duration = [(df[(0 <= df['duration']) & (df['duration'] <= 180)]).shape[0],(df[(180 < df['duration']) & (df['duration'] <= 360)]).shape[0],(df[(df['duration'] > 360)].shape[0])]
perc_duration = [round((df_sub[(0 <= df_sub['duration']) & (df_sub['duration'] <= 180)]).shape[0]/num_samp_duration[0]*100,2),round((df_sub[(180 < df_sub['duration']) & (df_sub['duration'] <= 360)]).shape[0]/num_samp_duration[1]*100,2),round((df_sub[(df_sub['duration'] > 360)]).shape[0]/num_samp_duration[2]*100,2)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_duration, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_duration, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Duration feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Duration of last contact")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig
In [61]:
def func(x):
    if x >= 0 and x<=180:
        return '0_3_minutes'
    elif x>180 and x<=360:
        return '3_6_minutes'
    else:
        return 'greater_than_6_minutes'
df['duration'] = df['duration'].apply(func)

Campaign¶

In [62]:
label = ['1-4','4-8','8-12','>12']
num_samp_cam = [(df[(1 <= df['campaign']) & (df['campaign'] <= 4)]).shape[0],(df[(4 < df['campaign']) & (df['campaign'] <= 8)]).shape[0],(df[(8 < df['campaign']) & (df['campaign'] <= 12)]).shape[0],(df[(df['campaign'] > 12)].shape[0])]
perc_cam = [round((df_sub[(1 <= df_sub['campaign']) & (df_sub['campaign'] <= 4)]).shape[0]/num_samp_cam[0]*100,2),round((df_sub[(4 < df_sub['campaign']) & (df_sub['campaign'] <= 8)]).shape[0]/num_samp_cam[1]*100,2),round((df_sub[(8 < df_sub['campaign']) & (df_sub['campaign'] <= 12)]).shape[0]/num_samp_cam[2]*100,2),round((df_sub[(df_sub['campaign'] > 12)]).shape[0]/num_samp_cam[3]*100,2)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=perc_cam, name="Subscriber percentage"),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=label, y=num_samp_cam, name="Number of samples",width=0.1),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Campaign feature"
)

# Set x-axis title
fig.update_xaxes(title_text="Number of contacts for the last campaign")

# Set y-axes titles
fig.update_yaxes(title_text="Subscriber percentage", secondary_y=False)
fig.update_yaxes(title_text="Number of samples", secondary_y=True)

fig

Apparently the number of subscriber decreases with the number of contacts in the campaign. This feature can be dropped.

Dropping feature and converting categorical¶

In [63]:
df = df.drop(columns=['contact','day','month','campaign'])
df.head()
Out[63]:
age job marital education default balance housing loan duration y
0 50_60 high_profile_job married tertiary 0 high_balance 0 0 3_6_minutes 0
1 40_50 office_job single secondary 0 medium_balance 0 0 0_3_minutes 0
2 30_40 high_profile_job married secondary 0 medium_balance 0 1 0_3_minutes 0
3 40_50 high_profile_job married unknown 0 medium_balance 0 0 0_3_minutes 0
4 30_40 not_full_time_job single unknown 0 medium_balance 1 0 3_6_minutes 0
In [64]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  object
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  int64 
 5   balance    40000 non-null  object
 6   housing    40000 non-null  int64 
 7   loan       40000 non-null  int64 
 8   duration   40000 non-null  object
 9   y          40000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 3.1+ MB
In [65]:
#scaler = MinMaxScaler()
#df['age'] = scaler.fit_transform(df['age'].values.reshape(-1,1))
#df['balance'] = scaler.fit_transform(df['balance'].values.reshape(-1,1))
#df['duration'] = scaler.fit_transform(df['duration'].values.reshape(-1,1))
In [66]:
df.describe()
Out[66]:
default housing loan y
count 40000.000000 40000.000000 40000.000000 40000.000000
mean 0.020225 0.399225 0.173250 0.072400
std 0.140771 0.489745 0.378468 0.259152
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 1.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
In [67]:
df_term = pd.get_dummies(df, columns = ['age','marital','education','duration','job','balance'])
df_term.head()
Out[67]:
default housing loan y age_18_30 age_30_40 age_40_50 age_50_60 age_over60 marital_divorced ... education_unknown duration_0_3_minutes duration_3_6_minutes duration_greater_than_6_minutes job_high_profile_job job_not_full_time_job job_office_job balance_high_balance balance_medium_balance balance_very_high_balance
0 0 0 0 0 False False False True False False ... False False True False True False False True False False
1 0 0 0 0 False False True False False False ... False True False False False False True False True False
2 0 0 1 0 False True False False False False ... False True False False True False False False True False
3 0 0 0 0 False False True False False False ... True True False False True False False False True False
4 0 1 0 0 False True False False False False ... True False True False False True False False True False

5 rows × 25 columns

Prepare data¶

In [68]:
y = df_term['y']
X = df_term.drop(columns = ['y'])
print(X.shape, y.shape)
(40000, 24) (40000,)
In [69]:
X.head()
Out[69]:
default housing loan age_18_30 age_30_40 age_40_50 age_50_60 age_over60 marital_divorced marital_married ... education_unknown duration_0_3_minutes duration_3_6_minutes duration_greater_than_6_minutes job_high_profile_job job_not_full_time_job job_office_job balance_high_balance balance_medium_balance balance_very_high_balance
0 0 0 0 False False False True False False True ... False False True False True False False True False False
1 0 0 0 False False True False False False False ... False True False False False False True False True False
2 0 0 1 False True False False False False True ... False True False False True False False False True False
3 0 0 0 False False True False False False True ... True True False False True False False False True False
4 0 1 0 False True False False False False False ... True False True False False True False False True False

5 rows × 24 columns

Train and evaluate different models¶

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
print(X_test.shape)
(8000, 24)
In [71]:
counter = Counter(y_train)
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]
print(counter[0])
print(counter[1])
print('Estimate: %.3f' % estimate)
29669
2331
Estimate: 12.728
In [72]:
svm = SVC()
tree = DecisionTreeClassifier()
rf =  RandomForestClassifier(max_depth=2, random_state=0)
bst = XGBClassifier(n_estimators=5000, max_depth=5, learning_rate=0.1, objective='binary:logistic',scale_pos_weight=estimate)

# Training the models 
svm.fit(X_train, y_train)
tree.fit(X_train, y_train)
rf.fit(X_train, y_train)
bst.fit(X_train, y_train)

# Making predictions with each model
svm_preds = svm.predict(X_test)
tree_preds = tree.predict(X_test)
rf_preds = rf.predict(X_test)
bst_preds = bst.predict(X_test)
In [73]:
model_preds = {
    "Support Vector Machine": svm_preds,
    "Decision Tree": tree_preds,
    "Random forest": rf_preds,
    "XGBoost": bst_preds
}

for model, preds in model_preds.items():
    print(f"{model} Results:\n{classification_report(y_test, preds)}", sep="\n\n")
Support Vector Machine Results:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      7435
           1       0.45      0.02      0.03       565

    accuracy                           0.93      8000
   macro avg       0.69      0.51      0.50      8000
weighted avg       0.90      0.93      0.90      8000

Decision Tree Results:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      7435
           1       0.32      0.07      0.11       565

    accuracy                           0.92      8000
   macro avg       0.63      0.53      0.54      8000
weighted avg       0.89      0.92      0.90      8000

Random forest Results:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      7435
           1       0.00      0.00      0.00       565

    accuracy                           0.93      8000
   macro avg       0.46      0.50      0.48      8000
weighted avg       0.86      0.93      0.90      8000

XGBoost Results:
              precision    recall  f1-score   support

           0       0.98      0.80      0.88      7435
           1       0.22      0.77      0.35       565

    accuracy                           0.80      8000
   macro avg       0.60      0.78      0.61      8000
weighted avg       0.92      0.80      0.84      8000

C:\Users\PicardiC\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

C:\Users\PicardiC\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

C:\Users\PicardiC\AppData\Local\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

XGBoost it is the best model, because also if the accuracy is lower compared to the other models it has a recall 0.77 while the others have very low recall and precision

In [74]:
confusion_matrix(y_test.values, bst_preds)
Out[74]:
array([[5930, 1505],
       [ 132,  433]], dtype=int64)
In [75]:
tn, fp, fn, tp = confusion_matrix(y_test.values, bst_preds).ravel()
print(f"Number of true positive: {tp}, number of false negative:{fn}, number of true negatives: {tn}, number of false positive {fp}.")
Number of true positive: 433, number of false negative:132, number of true negatives: 5930, number of false positive 1505.

Features importance¶

In [76]:
sorted_idx = bst.feature_importances_.argsort()
list_of_tuples = list(zip(X_test.columns[sorted_idx], bst.feature_importances_[sorted_idx]))
df = pd.DataFrame(list_of_tuples,columns=['Features', 'Importance scores'])
print(df)
                           Features  Importance scores
0                 education_unknown           0.006056
1                           default           0.006507
2                         age_40_50           0.007290
3                  marital_divorced           0.007556
4            balance_medium_balance           0.008261
5             job_not_full_time_job           0.008733
6               education_secondary           0.008873
7                         age_30_40           0.009568
8         balance_very_high_balance           0.009723
9              job_high_profile_job           0.009845
10                   marital_single           0.010031
11                        age_50_60           0.011445
12                  marital_married           0.011711
13                             loan           0.012512
14                   job_office_job           0.012652
15                education_primary           0.014126
16             balance_high_balance           0.014973
17             duration_3_6_minutes           0.015843
18               education_tertiary           0.019177
19                        age_18_30           0.019609
20                          housing           0.020472
21                       age_over60           0.036308
22             duration_0_3_minutes           0.064564
23  duration_greater_than_6_minutes           0.654168
In [77]:
# bult-in feature importance
fig = px.bar(x=X_test.columns[sorted_idx], y=bst.feature_importances_[sorted_idx], labels={'x':'Feature', 'y':'Importance scores'})
fig

Diagnostic_graph¶

In [78]:
X_test.info()
<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 3841 to 32191
Data columns (total 24 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   default                          8000 non-null   int64
 1   housing                          8000 non-null   int64
 2   loan                             8000 non-null   int64
 3   age_18_30                        8000 non-null   bool 
 4   age_30_40                        8000 non-null   bool 
 5   age_40_50                        8000 non-null   bool 
 6   age_50_60                        8000 non-null   bool 
 7   age_over60                       8000 non-null   bool 
 8   marital_divorced                 8000 non-null   bool 
 9   marital_married                  8000 non-null   bool 
 10  marital_single                   8000 non-null   bool 
 11  education_primary                8000 non-null   bool 
 12  education_secondary              8000 non-null   bool 
 13  education_tertiary               8000 non-null   bool 
 14  education_unknown                8000 non-null   bool 
 15  duration_0_3_minutes             8000 non-null   bool 
 16  duration_3_6_minutes             8000 non-null   bool 
 17  duration_greater_than_6_minutes  8000 non-null   bool 
 18  job_high_profile_job             8000 non-null   bool 
 19  job_not_full_time_job            8000 non-null   bool 
 20  job_office_job                   8000 non-null   bool 
 21  balance_high_balance             8000 non-null   bool 
 22  balance_medium_balance           8000 non-null   bool 
 23  balance_very_high_balance        8000 non-null   bool 
dtypes: bool(21), int64(3)
memory usage: 414.1 KB
In [79]:
def make_diagnostic_graph(dict_pred, dict_act, values, ylabel):
    for ind, el in np.ndenumerate(bst_preds):
        dict_pred[values[ind[0]]].append(el)
        dict_act[values[ind[0]]].append(y_test.values[ind[0]])
    mean_pred =[]
    for key in list(dict_pred.keys()):
        if len(dict_pred[key]) != 0:
            mean_pred.append(sum(dict_pred[key])/len(dict_pred[key]))
        else:
            mean_pred.append(0)
    mean_act =[]
    for key in dict_act.keys():
        if len(dict_act[key]) != 0:
            mean_act.append(sum(dict_act[key])/len(dict_act[key]))
        else:
            mean_act.append(0)
    upper_bound = max(max(mean_pred,mean_act))+0.05
    r = [0,upper_bound]
    # Create Plot

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add traces
    fig.add_trace(
        go.Bar(x=list(dict_pred.keys()), y=mean_pred, name="Avg predicted values",width=0.3),
        secondary_y=False,
    )

    fig.add_trace(
        go.Bar(x=list(dict_pred.keys()), y=mean_act, name="Avg actual values",width=0.1),
        secondary_y=True,
    )

    # Add figure title
    fig.update_layout(
        title_text="Diagnostic diagram"
    )

    # Set x-axis title
    fig.update_xaxes(title_text=ylabel)

    # Set y-axes titles
    fig.update_yaxes(title_text="Avg predicted values",  range=r, secondary_y=False)
    fig.update_yaxes(title_text="Avg actual values", range=r, secondary_y=True)

    fig.show()
In [80]:
def make_pred_act(dict_pred, dict_act, values):
    for ind, el in np.ndenumerate(bst_preds):
        dict_pred[values[ind[0]]].append(el)
        dict_act[values[ind[0]]].append(y_test.values[ind[0]])
    mean_pred =[]
    for key in list(dict_pred.keys()):
        if len(dict_pred[key]) != 0:
            mean_pred.append(sum(dict_pred[key])/len(dict_pred[key]))
        else:
            mean_pred.append(0)
    mean_act =[]
    for key in dict_act.keys():
        if len(dict_act[key]) != 0:
            mean_act.append(sum(dict_act[key])/len(dict_act[key]))
        else:
            mean_act.append(0)
    return mean_pred[1], mean_act[1]
In [111]:
def get_num_samples(vect):
    count = 0
    for el in vect:
        if el==1:
            count = count+1
    return count
In [81]:
zero_one_dict_pred = {0:[], 1:[]}
zero_one_dict_act = {0:[], 1:[]}
make_diagnostic_graph(zero_one_dict_pred, zero_one_dict_act, X_test.default.values,"Default")
In [82]:
zero_one_dict_pred = {0:[], 1:[]}
zero_one_dict_act = {0:[], 1:[]}
make_diagnostic_graph(zero_one_dict_pred, zero_one_dict_act, X_test.housing.values,"Housing")
In [83]:
zero_one_dict_pred = {0:[], 1:[]}
zero_one_dict_act = {0:[], 1:[]}
make_diagnostic_graph(zero_one_dict_pred, zero_one_dict_act, X_test.loan.values,"Loan")
In [112]:
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_18_30,mean_act_18_30 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_18_30.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_30_40,mean_act_30_40 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_30_40.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_40_50,mean_act_40_50 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_40_50.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_50_60,mean_act_50_60 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_50_60.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_over60,mean_act_over60 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.age_over60.values)
mean_pred = [mean_pred_18_30, mean_pred_30_40, mean_pred_40_50, mean_pred_50_60, mean_pred_over60]
mean_act = [mean_act_18_30, mean_act_30_40, mean_act_40_50, mean_act_50_60, mean_act_over60]
label =['18-30','30-40','40-50','50-60','over60']
num_samples = [get_num_samples(X_test.age_18_30.values),get_num_samples(X_test.age_30_40.values),get_num_samples(X_test.age_40_50.values),get_num_samples(X_test.age_50_60.values),get_num_samples(X_test.age_over60.values)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=label, y=mean_act, name="Avg actual values"),
    secondary_y=False,
)
fig.add_trace(
    go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
    secondary_y=True,
)
    # Add figure title
fig.update_layout(
    title_text="Diagnostic diagram"
)

# Set x-axis title
fig.update_xaxes(title_text='Age')

# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)

fig.show()
In [114]:
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_div,mean_act_div = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.marital_divorced.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_mar,mean_act_mar = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.marital_married.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act
act_sin = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.marital_single.values)
mean_pred = [mean_pred_div, mean_pred_mar, mean_pred_sin]
mean_act = [mean_act_div, mean_act_mar, mean_act_sin]
label =['Divorced','Married','Single']
num_samples = [get_num_samples(X_test.marital_divorced.values),get_num_samples(X_test.marital_married.values),get_num_samples(X_test.marital_single.values)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=label, y=mean_act, name="Avg actual values"),
    secondary_y=False,
)
fig.add_trace(
    go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
    secondary_y=True,
)
    # Add figure title
fig.update_layout(
    title_text="Diagnostic diagram"
)

# Set x-axis title
fig.update_xaxes(title_text='Marital status')

# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)

fig.show()
In [115]:
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_1,mean_act_1 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.education_primary.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_2,mean_act_2 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.education_secondary.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_3,mean_act_3 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.education_tertiary.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_4,mean_act_4 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.education_unknown.values)
mean_pred = [mean_pred_1, mean_pred_2, mean_pred_3, mean_pred_4]
mean_act = [mean_act_1, mean_act_2, mean_act_3, mean_act_4]
label =['Primary','Secondary','Tertiary','Unknown']
num_samples = [get_num_samples(X_test.education_primary.values),get_num_samples(X_test.education_secondary.values),get_num_samples(X_test.education_tertiary.values),get_num_samples(X_test.education_unknown.values)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=label, y=mean_act, name="Avg actual values"),
    secondary_y=False,
)
fig.add_trace(
    go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
    secondary_y=True,
)
    # Add figure title
fig.update_layout(
    title_text="Diagnostic diagram"
)

# Set x-axis title
fig.update_xaxes(title_text='Education')

# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)

fig.show()
In [116]:
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_1,mean_act_1 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.duration_0_3_minutes.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_2,mean_act_2 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.duration_3_6_minutes.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_3,mean_act_3 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.duration_greater_than_6_minutes.values)
mean_pred = [mean_pred_1, mean_pred_2, mean_pred_3]
mean_act = [mean_act_1, mean_act_2, mean_act_3]
label =['0-3 minutes','3-6 minutes','greater than 6 minutes']
num_samples = [get_num_samples(X_test.duration_0_3_minutes.values),get_num_samples(X_test.duration_3_6_minutes.values),get_num_samples(X_test.duration_greater_than_6_minutes.values)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=label, y=mean_act, name="Avg actual values"),
    secondary_y=False,
)
fig.add_trace(
    go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
    secondary_y=True,
)
    # Add figure title
fig.update_layout(
    title_text="Diagnostic diagram"
)

# Set x-axis title
fig.update_xaxes(title_text='Duration')

# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)

fig.show()
In [118]:
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_1,mean_act_1 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.job_high_profile_job.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_2,mean_act_2 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.job_not_full_time_job.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_3,mean_act_3 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.job_office_job.values)
mean_pred = [mean_pred_1, mean_pred_2, mean_pred_3]
mean_act = [mean_act_1, mean_act_2, mean_act_3]
label =['High-profile','Not full-time','Office']
num_samples = [get_num_samples(X_test.job_high_profile_job.values),get_num_samples(X_test.job_not_full_time_job.values),get_num_samples(X_test.job_office_job.values)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=label, y=mean_act, name="Avg actual values"),
    secondary_y=False,
)
fig.add_trace(
    go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
    secondary_y=True,
)
    # Add figure title
fig.update_layout(
    title_text="Diagnostic diagram"
)

# Set x-axis title
fig.update_xaxes(title_text='Job')

# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)

fig.show()
In [119]:
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_1,mean_act_1 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.balance_medium_balance.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_2,mean_act_2 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.balance_high_balance.values)
true_false_dict_pred = {False:[], True:[]}
true_false_dict_act = {False:[], True:[]}
mean_pred_3,mean_act_3 = make_pred_act(true_false_dict_pred, true_false_dict_act, X_test.balance_very_high_balance.values)
mean_pred = [mean_pred_1, mean_pred_2, mean_pred_3]
mean_act = [mean_act_1, mean_act_2, mean_act_3]
label =['Medium balance','High balance','Very high-balance']
num_samples = [get_num_samples(X_test.balance_medium_balance.values),get_num_samples(X_test.balance_high_balance.values),get_num_samples(X_test.balance_very_high_balance.values)]

# Create Plot

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=label, y=mean_pred, name="Avg predicted values"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=label, y=mean_act, name="Avg actual values"),
    secondary_y=False,
)
fig.add_trace(
    go.Bar(x=label, y=num_samples, name="num samples",width=0.1),
    secondary_y=True,
)
    # Add figure title
fig.update_layout(
    title_text="Diagnostic diagram"
)

# Set x-axis title
fig.update_xaxes(title_text='Balance')

# Set y-axes titles
fig.update_yaxes(title_text="Avg predicted/actual values", secondary_y=False)
fig.update_yaxes(title_text="num_samples", secondary_y=True)

fig.show()

Training and evaluate XBoost with cross-validation¶

Data are divided in two sets: data for Cross Validation, which we will call train_test and data for testing the final models, which we will call gtest for global test.

In [105]:
X_train_test, X_gtest, y_train_test, y_gtest = train_test_split(X, y, test_size=0.20, random_state=1)
In [106]:
counter = Counter(y_train_test)
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]
print(counter[0])
print(counter[1])
print('Estimate: %.3f' % estimate)
29669
2331
Estimate: 12.728

Given the previous results, only XGBoost is considered.

In [106]:
bst = XGBClassifier(n_estimators=5000, max_depth=5, learning_rate=0.1, objective='binary:logistic',scale_pos_weight=estimate)

# Cross-validation

cv_results_bst = cross_validate(bst, X_train_test, y_train_test, cv=5, return_estimator=True)
In [107]:
model_preds = {
    #"Logistic Regression": scores_lr.mean(),
    "XGBoost":  cv_results_bst['test_score'].mean()
}

for model, scores in model_preds.items():
    print(f"{model} Mean accuracy:\n{scores}", sep="\n\n")
XGBoost Mean accuracy:
0.796125
In [108]:
def global_test_score(cv_results):
    gtest_score = []
    for i in range(len(cv_results['estimator'])):
      gtest_score.append(cv_results['estimator'][i].score(X_gtest, y_gtest))
    return gtest_score
In [109]:
model_preds = {
    #"Logistic Regression": scores_lr.mean(),
    "XGBoost":  global_test_score(cv_results_bst)
}

for model, scores in model_preds.items():
    print(f"{model} Mean accuracy:\n{sum(scores) / len(scores)}", sep="\n\n")
XGBoost Mean accuracy:
0.7994499999999999
In [110]:
print(cv_results_bst['test_score'])
[0.80640625 0.79796875 0.78625    0.79484375 0.79515625]
In [111]:
scores =  global_test_score(cv_results_bst)
print(scores)
[0.8, 0.80025, 0.797375, 0.8, 0.799625]
In [112]:
bst = cv_results_bst['estimator'][1]
bst_preds = bst.predict(X_gtest)
In [113]:
y_gtest.shape
Out[113]:
(8000,)
In [114]:
subscriber = []
for x in y_gtest:
    if x ==1:
        subscriber.append(x)
print(len(subscriber))
565
In [115]:
confusion_matrix(y_gtest.values, bst_preds)
Out[115]:
array([[5982, 1453],
       [ 145,  420]], dtype=int64)
In [116]:
tn, fp, fn, tp = confusion_matrix(y_test.values, bst_preds).ravel()
print(f"Number of true positive: {tp}, number of false negative:{fn}, number of true negatives: {tn}, number of false positive {fp}.")
Number of true positive: 420, number of false negative:145, number of true negatives: 5982, number of false positive 1453.
In [117]:
print(cv_results_bst['test_score'])
[0.80640625 0.79796875 0.78625    0.79484375 0.79515625]
In [118]:
print(f"Results:\n{classification_report(y_gtest, bst_preds)}", sep="\n\n")
Results:
              precision    recall  f1-score   support

           0       0.98      0.80      0.88      7435
           1       0.22      0.74      0.34       565

    accuracy                           0.80      8000
   macro avg       0.60      0.77      0.61      8000
weighted avg       0.92      0.80      0.84      8000

The model reach an accuracy of 80% with high precision and recall for the negative class. For the positive class we have a very low precision and a recall of 77%. The results for the posive class can be improved collecting more samples.

Conclusion¶

The unbalanced dataset make impossible create an accurate model to correctly identify positive samples. More samples of subscriber should be collected or synthetized. In order to partially balancing the dataset I used the weight matrix of the classifier. From the feature importance analysis on the best model the duration of the call grater than 6 minutes and the age over 60 seems to be key feature in order to predict subscriber. This confirm the result of the data analysis where the retired people and over 60 are more incline to buy and people with negative balance and active loans are less incline to buy. Additionally From the data analysis the number of subscriber seems also to increase with the education grade and the students are more incline to subscribe.

In [ ]:
 
In [ ]: